"""
The Social Cost of Fiscal Federalism and the
Depletion of China’s Native Forests

Haoyu Wang hywang@vt.edu
"""
import pandas as pd
import os

# Set `path` to the folder that holds the Excel inputs. The working directory
# is switched there so every workbook below can be opened by bare file name.
path = "C:/Users/lenovo/Desktop/data"
os.chdir(path)

# The workbooks are read in the order listed and bound one-to-one to the
# names on the left-hand side.
(df0, df2, df4, df5, df6, df7, df9,
 df10, df11, df12, df14, df15, df20) = (
    pd.read_excel(workbook)
    for workbook in (
        'excel0.xlsx',
        'excel2.xlsx',
        'excel4.xlsx',
        'excel5.xlsx',
        'excel6.xlsx',
        'excel7.xlsx',
        'excel9.xlsx',
        'excel10.xlsx',
        'excel11.xlsx',
        'excel12.xlsx',
        'excel14.xlsx',
        'excel15.xlsx',
        'excel20.xlsx',
    )
)

# Read last; it is joined onto the merged table by 'year' later in the script.
inflation = pd.read_excel('deflator.xlsx')

# df12 and df14 carry their identifier under 'jcode'; rename it to 'fbid' so
# they share the (fbid, year) key used by most of the other tables.
df12, df14 = (
    frame.rename(columns={'jcode': 'fbid'}) for frame in (df12, df14)
)

# Tables keyed by (fbid, year).
dfs = dict(
    df2=df2, df4=df4, df5=df5, df7=df7,
    df9=df9, df10=df10, df11=df11, df12=df12, df14=df14,
    df20=df20,
)

# Tables keyed by (jcode, year).
dfs1 = dict(df6=df6, df15=df15)

# Report every table whose key columns repeat. Nothing is removed here; the
# loop only prints which tables were checked and which ones have repeats.
for tables, key_cols in ((dfs, ['fbid', 'year']), (dfs1, ['jcode', 'year'])):
    for label in tables:
        n_repeats = int(tables[label].duplicated(subset=key_cols).sum())
        print("check ", label)
        if n_repeats:
            print(f"{label} has duplicates")

# df0 is checked on 'fbid' alone (no 'year' in its key).
if df0.duplicated(subset=['fbid']).any():
    print("df0 has duplicates")


# Reduce df12 to its key columns plus 'a53', then discard exact repeats and
# any row with a missing value.
df12 = df12.loc[:, ['fbid', 'year', 'a53']].drop_duplicates().dropna()

# Rows whose (fbid, year) key still repeats after the clean-up. They are only
# printed for inspection; nothing is dropped or raised here.
repeated_keys = df12[df12.duplicated(subset=['fbid', 'year'])]
print("check number of duplicates ater dedup (should be 0): ", len(repeated_keys))
print(repeated_keys)
    
    
# Build the merged table by outer-joining the source tables one after another,
# so a key present in any table survives into the result.
fb_year = ['fbid', 'year']

df = df2
for other in (df4, df5):
    df = df.merge(other, on=fb_year, how='outer')

# NOTE(review): df6 is joined on 'jcode', so the left side must already carry
# a 'jcode' column at this point, whereas df15 below is joined by matching its
# 'jcode' to 'fbid' -- confirm that this asymmetry is intended.
df = df.merge(df6, on=['jcode', 'year'], how='outer')

for other in (df7, df9, df10, df11, df12, df14):
    df = df.merge(other, on=fb_year, how='outer')

df = df.merge(df15, left_on=fb_year, right_on=['jcode', 'year'], how='outer')
df = df.merge(df20, on=fb_year, how='outer')

# df0 has no 'year' in its key, so it is attached on 'fbid' alone.
df = df.merge(df0, on='fbid', how='outer')

# Left join: the year-level table adds columns without adding rows for years
# that are absent from the merged table.
df = df.merge(inflation, on='year', how='left')


# Give the coded source columns the names used in the rest of the script.
df = df.rename(columns={'ya1': 'tax', 'ya13': 'gamma', 'Cbb2': 'xbar'})

# tau is the ratio of 'tax' (scaled by 10000) to 'fst_indpv' (scaled by 1000).
# NOTE(review): the scale factors presumably put both columns in the same
# unit -- confirm against the data documentation.
df['tau'] = (df['tax'] * 10000) / (df['fst_indpv'] * 1000)

# G is the row-wise total of three D3_2_* columns, divided by 10000.
# min_count=1 leaves G as NaN when all three components are missing. With the
# default (min_count=0) such rows sum to 0.0, so the later 'Gmiss' flag is
# always False and the per-fbid interpolation of G has nothing to fill.
g_parts = df[['D3_2_22', 'D3_2_25', 'D3_2_27']]
df['G'] = g_parts.sum(axis=1, skipna=True, min_count=1) / 10000

df['P'] = df['a53'] / 10000

# Order rows by fbid and then year, and renumber the index from 0, so the
# later group-wise fills run in year order within each fbid.
df = (
    df.sort_values(by=['fbid', 'year'], ascending=[True, True])
      .reset_index(drop=True)
)

# Count rows whose (fbid, year) key repeats an earlier row of the merged table.
n_repeated = int(df.duplicated(subset=['fbid', 'year']).sum())
print("Duplicates in 'fbid' and 'year':\n", n_repeated)

# Column overview (dtypes and non-null counts) written to stdout.
df.info()

# 'prov' is the first character of the id's string form, converted to int.
fbid_text = df['fbid'].astype(str)
df['prov'] = fbid_text.str.get(0).astype(int)

# Spot checks: print 'xbar' for two specific (year, fbid) cells.
for yr, unit in ((1980, 10311), (2004, 30101)):
    cell = (df['year'] == yr) & (df['fbid'] == unit)
    print(df.loc[cell, 'xbar'])

# Within each fbid, fill a missing tau with the next available value.
df['tau'] = df['tau'].groupby(df['fbid']).bfill()


def _fill_gaps(series):
    """Interpolate linearly, then carry values forward and backward."""
    return series.interpolate(method='linear').ffill().bfill()


# For P and then G: first record which rows were missing, then fill the gaps
# separately within each fbid.
for column, flag in (('P', 'Pmiss'), ('G', 'Gmiss')):
    df[flag] = df[column].isna()
    df[column] = df.groupby('fbid')[column].transform(_fill_gaps)

# Save the full merged table before any sample restriction.
df.to_csv('df.csv', index=False)

# Estimation sample: the six columns below, complete cases only.
sm = df[['fbid', 'year', 'G', 'P', 'tau', 'xbar']].dropna()

# Printed explicitly: as a bare expression the summary table is discarded
# when this file is run as a script.
print(sm[['G', 'P', 'tau', 'xbar']].describe())

# Quartiles and interquartile range of every column in the sample.
Q1 = sm.quantile(0.25)
Q3 = sm.quantile(0.75)
IQR = Q3 - Q1

# Outlier limits at 1.5 * IQR beyond the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows whose P lies within the limits (only P is screened this way).
# The sample has no NaN after dropna(), so this positive form selects the
# same rows as negating "below the lower or above the upper limit".
sm = sm[(sm['P'] >= lower_bound['P']) & (sm['P'] <= upper_bound['P'])]

# Keep strictly positive G.
sm = sm[sm['G'] > 0]

# Keep tau inside the closed interval [0, 1].
sm = sm[(sm['tau'] >= 0) & (sm['tau'] <= 1)]

sm.to_csv('sm.csv', index=False)

